#!/usr/bin/env python3

# This script generates an RDF graph from the list of packages and their manpages

import gzip
import json
import os

from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF

# Distribution name; presumably the Debian suite the package list was
# taken from -- TODO confirm
DIST = 'testing'

# Namespaces used to build the subjects and predicates of the graph
MANPAGE = Namespace ('urn:manpage:')
MANPAGE_TERMS = Namespace ('urn:manpage:terms:')
SCHEMA = Namespace ('http://schema.org/')

# The single graph every triple of this script is added to, with a
# prefix bound to each of the namespaces above
mangraph = Graph ()
mangraph.bind ('manpage', MANPAGE)
mangraph.bind ('manpage-terms', MANPAGE_TERMS)
mangraph.bind ('schema', SCHEMA)

# Add sections to the graph
def add_section_info (number, description):
    """Add the triples describing one manual section to the graph.

    number      -- section number as a string (e.g. '1'); it is
                   concatenated into the subject name and the label
    description -- description of the section, stored as an English
                   language-tagged literal
    """
    section = MANPAGE['section' + number]
    
    # (predicate, object) pairs attached to the section subject
    properties = (
        (RDF.type, MANPAGE_TERMS.Section),
        (SCHEMA.name, Literal ('Section ' + number)),
        (MANPAGE_TERMS.section_number, Literal (number)),
        (SCHEMA.disambiguatingDescription, Literal (description, lang='en')),
    )
    
    for predicate, value in properties:
        mangraph.add ((section, predicate, value))

# Manual sections 1 to 8 and their descriptions
SECTIONS = (
    ('1', 'Executable programs or shell commands'),
    ('2', 'System calls (functions provided by the kernel)'),
    ('3', 'Library calls (functions within program libraries)'),
    ('4', 'Special files'),
    ('5', 'File formats and conventions'),
    ('6', 'Games'),
    ('7', 'Miscellaneous'),
    ('8', 'System administration commands'),
)

# Describe every section in the graph
for section_number, section_description in SECTIONS:
    add_section_info (section_number, section_description)

# Load the list of packages; each entry also carries its manpages.
# JSON text is UTF-8 by specification, so decode it explicitly instead
# of depending on the locale's preferred encoding.
with open ('manpages.json', 'rt', encoding='utf-8') as fp:
    packages = json.load (fp)

def _read_name_section (path):
    """Return the raw text of the NAME section of a TXT manpage.

    Collects the indented lines that follow the 'NAME' header line and
    stops at the first line that is not indented, or at end of file.
    Returns '' when the file has no 'NAME' header line.
    """
    collected = []
    in_name_section = False
    
    # The descriptions are matched against a non-ASCII dash below, so
    # decode as UTF-8 rather than with the locale's default encoding
    with open (path, 'rt', encoding='utf-8') as f:
        for line in f:
            if in_name_section:
                # NAME section terminated
                if not line.startswith (' '):
                    break
                # Multiline NAME section
                collected.append (line)
            elif line == 'NAME\n':
                in_name_section = True
    
    return ''.join (collected)


def _parse_description (name_text):
    """Return the description part of the text of a NAME section.

    name_text is the raw, possibly multi-line, section content
    (example: '       ls - list directory contents').  Returns '' when
    none of the known name/description separators is present.
    """
    # Collapse line breaks and indentation first: a wrapped NAME section
    # would otherwise leak them into the description, and a separator
    # falling at a line break would not be recognised
    text = ' '.join (name_text.split ())
    
    # There can be different kinds of separators.
    # Usually it's '—' to split name/description, but other
    # possibilities are '--', '-', ',', others?
    for separator in (' — ', ' -- ', ' - ', ','):
        if separator in text:
            return text.split (separator, 1)[1].strip ()
    
    # Tough luck! No description for this page :(
    return ''


# Loop packages
for package in packages:
    
    # DIST is 'testing', so this yields the 'debian-testing/amd64/' URIs
    rdfpackage = URIRef (MANPAGE + 'debian-' + DIST + '/amd64/' + package['name'])
    mangraph.add ((rdfpackage, RDF.type, MANPAGE_TERMS.Package))
    mangraph.add ((rdfpackage, SCHEMA.name, Literal (package['name'])))
    mangraph.add ((rdfpackage, SCHEMA.version, Literal (package['version'])))
    mangraph.add ((rdfpackage,
                   MANPAGE_TERMS.architecture,
                   Literal (package['architecture'])))
    mangraph.add ((rdfpackage,
                   MANPAGE_TERMS.filename,
                   Literal (package['deb'])))
    
    # Loop manpages for this package
    for page in package['manpages']:
        
        # A page that is just a link to another page only contributes an
        # alternate name to the page it links to
        if page['link_to'] is not None:
            mangraph.add ((URIRef (MANPAGE + page['link_to']),
                           SCHEMA.alternateName,
                           Literal (page['identifier'])))
            continue
        
        subject = URIRef (MANPAGE + page['identifier'])
        
        mangraph.add ((subject, RDF.type, SCHEMA.TextDigitalDocument))
        mangraph.add ((subject, SCHEMA.name, Literal (page['name'])))
        mangraph.add ((subject, SCHEMA.identifier, Literal (page['identifier'])))
        
        # Sections are 1..8 but can have variants such as "1p"
        mangraph.add ((subject,
                       MANPAGE_TERMS.section,
                       URIRef (MANPAGE.section + page['section'])))
        
        # This script used to read only the misspelled key
        # 'section_varinat'.  Prefer the correct spelling when the input
        # provides it and fall back to the old one otherwise (a missing
        # key still raises KeyError, as before).
        if 'section_variant' in page:
            section_variant = page['section_variant']
        else:
            section_variant = page['section_varinat']
        mangraph.add ((subject,
                       MANPAGE_TERMS.section_variant,
                       Literal (section_variant)))
        
        # Link to package
        mangraph.add ((subject, SCHEMA.isPartOf, rdfpackage))
        
        # Extract description from the TXT version of the manpage
        # Example: ls - list directory contents
        page_txt = 'man/man' + page['section'] + '/' + page['identifier'] + '.txt'
        
        if os.path.isfile (page_txt):
            description = _parse_description (_read_name_section (page_txt))
            mangraph.add ((subject,
                           SCHEMA.disambiguatingDescription,
                           Literal (description)))
        else:
            print ('Warning: file not found ', page_txt)

# Write the complete graph to 'manpages.ttl' in Turtle format
mangraph.serialize (destination='manpages.ttl', format='turtle')
